In [ ]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
In [2]:
%matplotlib inline
plt.rcParams['figure.figsize'] = 6, 4.5
plt.rcParams['axes.grid'] = True
plt.gray()
In [3]:
cd ..
In [4]:
import train
import json
import imp
First, we have to load the settings from the JSON file:
In [5]:
settings = json.load(open('SETTINGS.json', 'r'))
In [6]:
settings.keys()
Out[6]:
In [7]:
data = train.get_data(settings['FEATURES'])
Doing this, we get a dictionary of dictionaries:
In [10]:
type(data)
Out[10]:
In [11]:
data.keys()
Out[11]:
In [12]:
data['raw_feat_var_'].keys()
Out[12]:
In [13]:
data['raw_feat_var_']['Patient_2'].keys()
Out[13]:
In [14]:
data['raw_feat_var_']['Patient_2']['interictal'].keys()
Out[14]:
It's dictionaries all the way down.
Until you get to the feature vectors, obviously:
In [16]:
data['raw_feat_var_']['Patient_2']['interictal']['Patient_2_interictal_segment_0034.mat']
Out[16]:
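So the general access pattern is data[feature][subject][ictal_class][segment_filename], which finally hands back a NumPy array. A minimal sketch (reusing the segment shown above; ictal_class is just a placeholder name):

fvec = data['raw_feat_var_']['Patient_2']['interictal']['Patient_2_interictal_segment_0034.mat']
print(type(fvec), fvec.shape)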
Unfortunately, we want a feature matrix and a target vector to shove into whatever machine learning code we want to use. Should be pretty easy to get that out of the above data structure, though. The plan: prototype this function in the notebook, then save it to utils.py.
In [19]:
import numpy as np
In [51]:
def buildtraining(subject,features,data):
    """Function to build data structures for ML:
    * __Input__: subject, features
    * __Output__: X feature matrix, y target vector
    It will not tell you which feature is which."""
    # hacking this for later
    first = features[0]
    for feature in features:
        Xf = np.array([])
        # enumerate to get numbers for target vector:
        #   0 is interictal
        #   1 is preictal
        for i,ictal in enumerate(['interictal','preictal']):
            for segment in data[feature][subject][ictal].keys():
                # now stack up the feature vectors
                try:
                    Xf = np.vstack([Xf,data[feature][subject][ictal][segment].T])
                except ValueError:
                    Xf = data[feature][subject][ictal][segment].T
                # and stack up the target vector
                # but only for the first feature (will be the same for the rest)
                if feature == first:
                    try:
                        y.append(i)
                    except NameError:
                        y = [i]
        # stick the X arrays together
        try:
            X = np.hstack([X,Xf])
        except NameError:
            X = Xf
        except ValueError:
            print(feature)
            print(X.shape,Xf.shape)
    # turn y into an array
    y = np.array(y)
    return X,y
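The try/except blocks above are doing first-iteration initialisation: the first vstack fails with a ValueError because Xf starts out empty, and the first use of X and y raises a NameError because they don't exist yet. A minimal sketch of the NameError idiom on its own (acc is just a throwaway name):

try:
    acc.append(1)   # NameError on the first pass: acc doesn't exist yet
except NameError:
    acc = [1]

One caveat: at the top level of a notebook this idiom is fragile, since the variable can survive from an earlier run of the cell; inside a function, as here, it's safe.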
How the enumerate works:
In [21]:
for i,x in enumerate(['interictal','preictal']):
    print(i,x)
Testing the above:
In [41]:
X,y = buildtraining('Dog_1',['raw_feat_var_','raw_feat_cov_'],data)
In [43]:
X.shape
Out[43]:
In [44]:
y.shape
Out[44]:
Appears to have worked.
Attempting it on all features:
In [52]:
X,y = buildtraining('Dog_1',list(data.keys()),data)
Caught the above errors; it looks like those two features are a bit weird. Maybe they're not coming in as vectors?
We should probably just flatten them.
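As a quick check of what flattening does here (a minimal sketch; the 16×16 shape is just illustrative for a matrix-valued feature like the covariance):

cov = np.eye(16)                # stand-in for a 16x16 covariance-style feature
flat = np.ndarray.flatten(cov)  # row-major 1-D copy, shape (256,)
print(cov.shape, flat.shape)    # (16, 16) (256,)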
In [13]:
def buildtraining(subject,features,data):
    """Function to build data structures for ML:
    * __Input__: subject, features
    * __Output__: X feature matrix, y target vector
    It will not tell you which feature is which."""
    # hacking this for later
    first = features[0]
    for feature in features:
        Xf = np.array([])
        # enumerate to get numbers for target vector:
        #   0 is interictal
        #   1 is preictal
        for i,ictal in enumerate(['interictal','preictal']):
            for segment in data[feature][subject][ictal].keys():
                # now stack up the feature vectors
                try:
                    Xf = np.vstack([Xf,np.ndarray.flatten(data[feature][subject][ictal][segment].T)])
                except ValueError:
                    Xf = np.ndarray.flatten(data[feature][subject][ictal][segment].T)
                # and stack up the target vector
                # but only for the first feature (will be the same for the rest)
                if feature == first:
                    try:
                        y.append(i)
                    except NameError:
                        y = [i]
        # stick the X arrays together
        try:
            X = np.hstack([X,Xf])
        except NameError:
            X = Xf
        except ValueError:
            print(feature)
            print(X.shape,Xf.shape)
    # turn y into an array
    y = np.array(y)
    return X,y
In [14]:
X,y = buildtraining('Dog_1',list(data.keys()),data)
OK, it now appears to work.
In [82]:
%save buildtraining.py 56
In [8]:
import sklearn.preprocessing
import sklearn.pipeline
import sklearn.ensemble
import sklearn.cross_validation
import sklearn.svm
In [9]:
scaler = sklearn.preprocessing.StandardScaler()
forest = sklearn.ensemble.RandomForestClassifier()
model = sklearn.pipeline.Pipeline([('scl',scaler),('clf',forest)])
In [10]:
svc = sklearn.svm.SVC()
modelsvc = sklearn.pipeline.Pipeline([('scl',scaler),('clf',svc)])
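One thing to note: both pipelines above share the same StandardScaler instance, and Pipeline.fit refits its steps in place (cross_val_score is safe, since it clones the whole pipeline before fitting). A minimal sketch of the more defensive pattern, giving each pipeline its own scaler:

modelsvc = sklearn.pipeline.Pipeline([
    ('scl', sklearn.preprocessing.StandardScaler()),
    ('clf', sklearn.svm.SVC()),
])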
Starting with default settings:
In [15]:
tenfold = sklearn.cross_validation.StratifiedKFold(y,n_folds=10)
In [16]:
sklearn.cross_validation.cross_val_score(model,X,y,cv=tenfold)
Out[16]:
In [17]:
X.shape
Out[17]:
Not that impressive, once you compare it to the accuracy you'd get by predicting all zeros for this one:
In [80]:
1-sum(y)/len(y)
Out[80]:
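That is, a classifier that always predicts interictal already gets that accuracy, so plain accuracy is misleading on data this imbalanced. A minimal sketch making the baseline explicit with a majority-class dummy model (assuming sklearn.dummy is available in this sklearn version):

import sklearn.dummy
dummy = sklearn.dummy.DummyClassifier(strategy='most_frequent')
sklearn.cross_validation.cross_val_score(dummy, X, y, cv=tenfold)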
Let's try increasing the number of trees we're using.
In [18]:
model.set_params(clf__n_estimators=3000)
Out[18]:
In [19]:
%%time
sklearn.cross_validation.cross_val_score(model,X,y,cv=tenfold)
Out[19]:
In [165]:
sklearn.cross_validation.cross_val_score(modelsvc,X,y,cv=tenfold)
Out[165]:
In [94]:
%%time
sklearn.cross_validation.cross_val_score(model,X,y,cv=tenfold,scoring='roc_auc')
Out[94]:
In [166]:
sklearn.cross_validation.cross_val_score(modelsvc,X,y,cv=tenfold,scoring='roc_auc')
Out[166]:
Well, that's not below 0.5 (chance level for ROC AUC), so it's good enough for a submission. Time to go ahead and do that.
So, I'll need another function like the one above to create a test matrix for each subject. Then, I can iterate over subjects, training the model and then classifying the test.
In [20]:
def buildtest(subject,features,data):
    """Function to build data structures for submission:
    * __Input__: subject, features, data
    * __Output__: X feature matrix, labels
    It will not tell you which feature is which."""
    Xd = {}
    for feature in features:
        for segment in data[feature][subject]['test'].keys():
            fvector = np.ndarray.flatten(data[feature][subject]['test'][segment])
            try:
                Xd[segment] = np.hstack([Xd[segment],fvector])
            except KeyError:
                # first feature seen for this segment
                Xd[segment] = fvector
    # make the X array and corresponding labels
    segments = []
    X = []
    for segment in Xd.keys():
        segments.append(segment)
        X.append(Xd[segment])
    X = np.vstack(X)
    return X,segments
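Keying the intermediate results by segment name means each feature's vector gets hstacked onto the right segment regardless of the order each per-feature dictionary iterates in; the final loop then fixes a single segment ordering shared by X and the returned labels.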
In [23]:
features = list(data.keys())
subjects = list(data[features[0]].keys())
Had to remove this feature as it didn't cover all subjects for some reason.
In [22]:
features.remove('raw_feat_xcorr_')
In [24]:
X,segments = buildtest(subjects[0],features,data)
Works, but I'm not saving it yet, as I want to reorganise the code I already saved as well.
In [26]:
%%time
predictiondict = {}
for subj in subjects:
    # training step
    X,y = buildtraining(subj,features,data)
    model.fit(X,y)
    # prediction step
    X,segments = buildtest(subj,features,data)
    predictions = model.predict_proba(X)
    for segment,prediction in zip(segments,predictions):
        predictiondict[segment] = prediction
Running for SVC as well.
In [167]:
%%time
svcpredictiondict = {}
for subj in subjects:
    # training step
    X,y = buildtraining(subj,features,data)
    modelsvc.fit(X,y)
    # prediction step
    X,segments = buildtest(subj,features,data)
    predictions = modelsvc.predict(X)
    for segment,prediction in zip(segments,predictions):
        svcpredictiondict[segment] = prediction
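One caveat here: this uses predict rather than predict_proba, because sklearn.svm.SVC only exposes probability estimates when constructed with probability=True (at the cost of an extra Platt-scaling fit). A minimal sketch of the probabilistic variant (svcp and modelsvcp are hypothetical names), if we wanted scores comparable to the other models:

svcp = sklearn.svm.SVC(probability=True)
modelsvcp = sklearn.pipeline.Pipeline([('scl',scaler),('clf',svcp)])
# after fitting, modelsvcp.predict_proba(X) gives [P(interictal), P(preictal)] rows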
Trying logistic regression, as we now have many more features.
In [32]:
import sklearn.linear_model
In [33]:
logreg = sklearn.linear_model.LogisticRegression()
modellr = sklearn.pipeline.Pipeline([('scl',scaler),('clf',logreg)])
In [36]:
%%time
lrpredictiondict = {}
for subj in subjects:
    # training step
    X,y = buildtraining(subj,features,data)
    modellr.fit(X,y)
    # prediction step
    X,segments = buildtest(subj,features,data)
    predictions = modellr.predict_proba(X)
    for segment,prediction in zip(segments,predictions):
        lrpredictiondict[segment] = prediction
In [27]:
import csv
In [30]:
with open("output/protosubmission.csv","w") as f:
c = csv.writer(f)
c.writerow(['clip','preictal'])
for seg in predictiondict.keys():
c.writerow([seg,"%s"%predictiondict[seg][-1]])
In [31]:
!head output/protosubmission.csv
In [154]:
!wc -l output/protosubmission.csv
Looks like it's the right length. Submitted now, and we got 0.59308 for it, which isn't too bad.
Saving LR as well:
In [37]:
with open("output/protolr.csv","w") as f:
c = csv.writer(f)
c.writerow(['clip','preictal'])
for seg in predictiondict.keys():
c.writerow([seg,"%s"%lrpredictiondict[seg][-1]])
In [170]:
!head output/protosvc.csv